Data Load

In [1]:
# Imports — duplicates removed (the original cell imported sklearn's
# `tree`, `preprocessing`, and `neighbors` twice each).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import neighbors
from sklearn import preprocessing
from sklearn import tree
from sklearn import metrics
from sklearn import model_selection
from sklearn import linear_model
from sklearn import feature_selection
from sklearn import ensemble
from sklearn import naive_bayes
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from tempfile import TemporaryFile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp  # NOTE(review): scipy.interp is deprecated; unused here — prefer np.interp if needed
from sklearn.preprocessing import label_binarize
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split


# Load the handset-segmentation training set.
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR.
df = pd.read_csv("C:/Users/Apratim/Desktop/C/ml/datasets/handset_segmentation/train.csv")

Data Study & Analysis

In [3]:
def brief_info(data=None):
    """Print a structural overview of a DataFrame and return its describe().

    Shows info/shape/head/tail, then lists the int64 ("discrete") columns
    versus the remaining ("continuous") columns.

    Parameters
    ----------
    data : DataFrame, optional — defaults to the globally loaded `df`,
        keeping the original no-argument call working.
    """
    if data is None:
        data = df
    print(data.info())
    print(data.shape)
    print(data.head())
    print(data.tail())
    print('Discrete columns are:')
    for col in data.columns:
        if data[col].dtypes == 'int64':
            print(col, '|', end=' ')
    print('\n__________________________')
    # Fixed typo in the printed label: "Continous" -> "Continuous".
    print('Continuous Columns are:')
    for col in data.columns:
        if data[col].dtypes != 'int64':
            print(col)
    return data.describe()
brief_info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
battery_power    2000 non-null int64
blue             2000 non-null int64
clock_speed      2000 non-null float64
dual_sim         2000 non-null int64
fc               2000 non-null int64
four_g           2000 non-null int64
int_memory       2000 non-null int64
m_dep            2000 non-null float64
mobile_wt        2000 non-null int64
n_cores          2000 non-null int64
pc               2000 non-null int64
px_height        2000 non-null int64
px_width         2000 non-null int64
ram              2000 non-null int64
sc_h             2000 non-null int64
sc_w             2000 non-null int64
talk_time        2000 non-null int64
three_g          2000 non-null int64
touch_screen     2000 non-null int64
wifi             2000 non-null int64
price_range      2000 non-null int64
dtypes: float64(2), int64(19)
memory usage: 328.2 KB
None
(2000, 21)
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range  
0        0             0     1            1  
1        1             1     0            2  
2        1             1     0            2  
3        1             0     0            2  
4        1             1     0            1  

[5 rows x 21 columns]
      battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  \
1995            794     1          0.5         1   0       1           2   
1996           1965     1          2.6         1   0       0          39   
1997           1911     0          0.9         1   1       1          36   
1998           1512     0          0.9         0   4       1          46   
1999            510     1          2.0         1   5       1          45   

      m_dep  mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  \
1995    0.8        106        6  ...       1222      1890   668    13     4   
1996    0.2        187        4  ...        915      1965  2032    11    10   
1997    0.7        108        8  ...        868      1632  3057     9     1   
1998    0.1        145        5  ...        336       670   869    18    10   
1999    0.9        168        6  ...        483       754  3919    19     4   

      talk_time  three_g  touch_screen  wifi  price_range  
1995         19        1             1     0            0  
1996         16        1             1     1            2  
1997          5        1             1     0            3  
1998         19        1             1     1            0  
1999          2        1             1     1            3  

[5 rows x 21 columns]
Discrete columns are:
battery_power | blue | dual_sim | fc | four_g | int_memory | mobile_wt | n_cores | pc | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range | 
__________________________
Continous Columns are:
clock_speed
m_dep
Out[3]:
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores ... px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi price_range
count 2000.000000 2000.0000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 ... 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000
mean 1238.518500 0.4950 1.522250 0.509500 4.309500 0.521500 32.046500 0.501750 140.249000 4.520500 ... 645.108000 1251.515500 2124.213000 12.306500 5.767000 11.011000 0.761500 0.503000 0.507000 1.500000
std 439.418206 0.5001 0.816004 0.500035 4.341444 0.499662 18.145715 0.288416 35.399655 2.287837 ... 443.780811 432.199447 1084.732044 4.213245 4.356398 5.463955 0.426273 0.500116 0.500076 1.118314
min 501.000000 0.0000 0.500000 0.000000 0.000000 0.000000 2.000000 0.100000 80.000000 1.000000 ... 0.000000 500.000000 256.000000 5.000000 0.000000 2.000000 0.000000 0.000000 0.000000 0.000000
25% 851.750000 0.0000 0.700000 0.000000 1.000000 0.000000 16.000000 0.200000 109.000000 3.000000 ... 282.750000 874.750000 1207.500000 9.000000 2.000000 6.000000 1.000000 0.000000 0.000000 0.750000
50% 1226.000000 0.0000 1.500000 1.000000 3.000000 1.000000 32.000000 0.500000 141.000000 4.000000 ... 564.000000 1247.000000 2146.500000 12.000000 5.000000 11.000000 1.000000 1.000000 1.000000 1.500000
75% 1615.250000 1.0000 2.200000 1.000000 7.000000 1.000000 48.000000 0.800000 170.000000 7.000000 ... 947.250000 1633.000000 3064.500000 16.000000 9.000000 16.000000 1.000000 1.000000 1.000000 2.250000
max 1998.000000 1.0000 3.000000 1.000000 19.000000 1.000000 64.000000 1.000000 200.000000 8.000000 ... 1960.000000 1998.000000 3998.000000 19.000000 18.000000 20.000000 1.000000 1.000000 1.000000 3.000000

8 rows × 21 columns

In [4]:
l4 = []  # module-level: boolean-valued column names, reused by later cells
def seperator():
    """Split df's columns into boolean-valued (exactly 2 unique values) and
    the rest; print both groups and value counts for the boolean columns.

    Side effect: appends the boolean column names (in column order) to the
    module-level list `l4`, exactly as the original index-bookkeeping did.
    """
    # The original built parallel lists of arrays, unique-value arrays, and
    # matching indices; `nunique() == 2` expresses the same test directly.
    for col in df.columns:
        if df[col].nunique() == 2:
            l4.append(col)
    rest = list(set(df.columns) - set(l4))
    print("Boolean valued columns are:", l4)
    print("______________________")
    print("Rest of the columns are:", rest)
    print("______________________")
    for item in l4:
        print(df[item].value_counts())
seperator()
Boolean valued columns are: ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
______________________
Rest of the columns are: ['n_cores', 'battery_power', 'pc', 'fc', 'ram', 'px_width', 'sc_h', 'm_dep', 'mobile_wt', 'talk_time', 'int_memory', 'price_range', 'clock_speed', 'sc_w', 'px_height']
______________________
0    1010
1     990
Name: blue, dtype: int64
1    1019
0     981
Name: dual_sim, dtype: int64
1    1043
0     957
Name: four_g, dtype: int64
1    1523
0     477
Name: three_g, dtype: int64
1    1006
0     994
Name: touch_screen, dtype: int64
1    1014
0     986
Name: wifi, dtype: int64
In [5]:
# Separate the target label from the feature matrix.
y = df.price_range
X = df.drop(columns=["price_range"])

Visualization & Graphical Analysis

In [6]:
# Pairwise scatter/KDE matrix of all features, colored by price_range.
sns.pairplot(data=df,hue='price_range')
C:\Users\Apratim\Anaconda3\desktop\lib\site-packages\statsmodels\nonparametric\kde.py:488: RuntimeWarning: invalid value encountered in true_divide
  binned = fast_linbin(X, a, b, gridsize) / (delta * nobs)
C:\Users\Apratim\Anaconda3\desktop\lib\site-packages\statsmodels\nonparametric\kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
  FAC1 = 2*(np.pi*bw/RANGE)**2
Out[6]:
<seaborn.axisgrid.PairGrid at 0x1eaa6fb3d68>
In [7]:
def plot():
    """Draw one count plot (split by price_range) for every boolean-valued
    column collected in the module-level list `l4`."""
    # Iterate over the names directly instead of range(len(l4)).
    for col in l4:
        sns.countplot(x=col, hue='price_range', data=df)
        plt.show()
plot()
In [8]:
def showplot():
    """Plot a KDE of each selected feature on its own FacetGrid, with
    price_range as the hue, so tier-wise distributions can be compared."""
    features = ['ram', 'int_memory', 'px_height', 'talk_time']
    for feature in features:
        grid = sns.FacetGrid(data=df, hue="price_range")
        grid.map(sns.kdeplot, feature)
        grid.add_legend()
        plt.show()
showplot()
In [10]:
# Joint KDE of ram vs. price_range.
sns.jointplot(x='ram',y='price_range',data=df,color='red',kind='kde')
Out[10]:
<seaborn.axisgrid.JointGrid at 0x1eac6a60c88>
In [12]:
# Joint KDE of int_memory vs. price_range.
sns.jointplot(x='int_memory',y='price_range',data=df,color='blue',kind='kde')
Out[12]:
<seaborn.axisgrid.JointGrid at 0x1eac7afaf60>
In [13]:
# Point plot of px_height per price_range tier.
sns.pointplot(y='px_height',x='price_range',data=df)
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eac7d516d8>
In [14]:
# Point plot of clock_speed per price_range tier.
sns.pointplot(y='clock_speed',x='price_range',data=df)
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eac7d926a0>
In [15]:
# Joint KDE of price_range vs. clock_speed.
sns.jointplot(x='price_range',y='clock_speed',data=df,color='green',kind='kde')
Out[15]:
<seaborn.axisgrid.JointGrid at 0x1eac7dcbc50>
In [16]:
# Point plot of talk_time per price_range tier.
sns.pointplot(y='talk_time',x='price_range',data=df)
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eac7e6c550>
In [17]:
# Box plot of battery_power distribution within each price_range tier.
sns.boxplot(x="price_range",y="battery_power",data=df)
plt.show()
In [18]:
# Pie chart of 4G support.
labels4g=["4g_supported","Not_supported"]
# Fix: value_counts() orders by FREQUENCY, so its alignment with the label
# list above was coincidental. reindex([1, 0]) pins counts to labels:
# category 1 (supported) first, category 0 (not supported) second.
values4g=df["four_g"].value_counts().reindex([1, 0]).values
fig1,ax1=plt.subplots()
ax1.pie(values4g,labels=labels4g,shadow=True,startangle=90)
plt.show()
In [19]:
# Overlaid histograms comparing front (fc) vs. primary (pc) camera megapixels.
plt.figure(figsize=(10,6))
df["fc"].hist(alpha=0.5,color="blue",label="Front Camera")
df["pc"].hist(alpha=0.5,color="red",label="Primary Camera")
plt.legend()
plt.xlabel("MegaPixels")
Out[19]:
Text(0.5, 0, 'MegaPixels')
In [20]:
# Overlaid histograms comparing pixel height vs. pixel width.
plt.figure(figsize=(10,6))
df["px_height"].hist(alpha=0.5,color="yellow",label="Pixel Height")
df["px_width"].hist(alpha=0.5,color="pink",label="Pixel Width")
plt.legend()
plt.xlabel("Pixels")
Out[20]:
Text(0.5, 0, 'Pixels')
In [21]:
# Overlaid histograms comparing screen height vs. screen width.
plt.figure(figsize=(10,6))
df["sc_h"].hist(alpha=0.5,color="green",label="Screen Height")
df["sc_w"].hist(alpha=0.5,color="red",label="Screen Width")
plt.legend()
plt.xlabel("Screen Size")
Out[21]:
Text(0.5, 0, 'Screen Size')

Creating & training models

In [22]:
# Standardize every feature to zero mean / unit variance, then wrap the
# resulting array back into a DataFrame carrying the original column names.
X1 = preprocessing.scale(X, with_mean=True, with_std=True, copy=True)
df1 = pd.DataFrame(X1, columns=X.columns)
X2 = df1
C:\Users\Apratim\Anaconda3\desktop\lib\site-packages\ipykernel_launcher.py:1: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by the scale function.
  """Entry point for launching an IPython kernel.

Feature Selection

In [23]:
def mutual(X, y):
    """Bar-plot the mutual information between each feature and the target.

    Parameters
    ----------
    X : DataFrame of (scaled) features.
    y : target labels.

    Generalized: the series index now comes from X's own columns instead of
    the module-level `df1`, so the function works for any feature frame.
    (Identical here, since the caller passes X2, which aliases df1.)
    """
    mi_scores = feature_selection.mutual_info_classif(X, y)
    mi_series = pd.Series(mi_scores, index=X.columns.values)
    mi_series.sort_values(ascending=False).plot.bar()
mutual(X2, y)
In [24]:
list1 = []  # module-level: column names to DROP (consumed by the next cell)
def feature(n):
    """Rank features by two methods, plot both rankings, and record drops.

    Method 1: SelectKBest (default f_classif score) on the scaled frame X2.
    Method 2: ExtraTreesClassifier feature importances.
    The intersection of the two top-n sets is kept; every other column name
    from df (including the target) is appended to the module-level `list1`.
    """
    selector = SelectKBest(k=n)
    fit = selector.fit(X2, y)
    scores = pd.DataFrame(fit.scores_)
    columns = pd.DataFrame(df1.columns)
    # Concatenate so each column name sits beside its univariate score.
    feature_scores = pd.concat([columns, scores], axis=1)
    feature_scores.columns = ['Specs', 'Score']
    top_univariate = feature_scores.nlargest(n, 'Score')
    top_univariate.plot(kind='bar')
    plt.show()
    set1 = set(top_univariate["Specs"])
    # Feature-importance method
    print("==============================================")
    # random_state pinned so the tree-based ranking (and therefore the final
    # feature set) is reproducible across runs; the original was unseeded.
    model1 = ExtraTreesClassifier(random_state=42)
    model1.fit(X2, y)
    feat_importances = pd.Series(model1.feature_importances_, index=df1.columns)
    top_tree = feat_importances.nlargest(n)
    set2 = set(top_tree.index)
    top_tree.plot(kind='bar')
    plt.show()
    selected = set1.intersection(set2)
    # Everything not selected by BOTH methods gets queued for dropping.
    list1.extend(set(df.columns).difference(selected))

feature(5)
==============================================
C:\Users\Apratim\Anaconda3\desktop\lib\site-packages\sklearn\ensemble\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)

Train/Test Split & Base Models

In [25]:
# feature() queued every non-selected column, including the target; keep the
# target out of the drop list, then remove the rejected features.
list1.remove("price_range")
X2=df1.drop(columns=list1)
##Train,test split
# 60/40 split with a fixed seed for reproducibility.
Xtrain,Xtest,ytrain,ytest=model_selection.train_test_split(X2,y,test_size=.40,random_state=42)
# Baseline classifiers; kNN and random-forest hyperparameters are tuned below.
model=linear_model.LogisticRegression()
treemodel=tree.DecisionTreeClassifier(max_depth=2)
gnbobj=naive_bayes.GaussianNB()
In [32]:
rfmodel=ensemble.RandomForestClassifier()
b=[]
def best_n(n):
    """Grid-search n_estimators over {200, n} with 5-fold CV and record the
    winning value in the module-level list `b`."""
    grid = {'n_estimators': [200, n]}
    search = GridSearchCV(estimator=rfmodel, param_grid=grid, cv=5)
    search.fit(Xtrain, ytrain)
    b.append(search.best_params_['n_estimators'])
best_n(500)
# Rebuild the forest with the tuned estimator count.
rfmodel=ensemble.RandomForestClassifier(n_estimators=b[0])

Model Fit & Prediction

In [27]:
a=[]
def best_k(n):
    """Grid-search odd n_neighbors values in [5, n) and record the best one
    in the module-level list `a`."""
    k_values = [l for l in range(5, n, 2)]
    # Bug fix: the original built KNeighborsClassifier(n_neighbors=k_values),
    # passing the whole LIST as the parameter. GridSearchCV overwrote it per
    # candidate so it happened to run, but the base estimator was invalid.
    # Construct with defaults and let the grid supply n_neighbors. The
    # redundant refit of best_estimator_ on the training set is also dropped —
    # only best_params_ is consumed.
    knn = neighbors.KNeighborsClassifier()
    grid = {"n_neighbors": k_values}
    grid_obj = GridSearchCV(estimator=knn, param_grid=grid)
    grid_fit = grid_obj.fit(Xtrain, ytrain)
    a.append(grid_fit.best_params_['n_neighbors'])
best_k(17)
# Final kNN model with the tuned neighbor count.
knnobj=neighbors.KNeighborsClassifier(n_neighbors=a[0])
C:\Users\Apratim\Anaconda3\desktop\lib\site-packages\sklearn\model_selection\_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)
In [28]:
def cross_entropy(y, p):
    """Mean multiclass cross-entropy (negative log-likelihood).

    Parameters
    ----------
    y : array-like of int, shape (m,) — true class indices.
    p : ndarray, shape (m, n_classes) — predicted class probabilities.

    Returns
    -------
    float — average of -log(p[i, y[i]]) over the m samples.

    Fix: the picked probabilities are clipped away from 0 so that a single
    zero-probability prediction yields a large finite loss instead of inf
    (same 1e-15 floor sklearn's log_loss uses by default).
    """
    y = np.asarray(y)
    m = y.shape[0]
    eps = 1e-15
    picked = np.clip(p[np.arange(m), y], eps, 1.0)
    return -np.mean(np.log(picked))
In [29]:
def multiclass_roc_auc_score(ytest, testp, average="macro"):
    """One-vs-rest ROC-AUC for a multiclass problem.

    Binarizes the true and predicted labels with the SAME LabelBinarizer
    (fitted on the true labels) so the class columns line up, then scores.
    """
    binarizer = LabelBinarizer().fit(ytest)
    true_bin = binarizer.transform(ytest)
    pred_bin = binarizer.transform(testp)
    return roc_auc_score(true_bin, pred_bin, average=average)
In [30]:
li=[]   # cross-entropy losses, one entry per evaluated model
li1=[]  # ROC-AUC scores
li2=[]  # accuracies
def model_built(m):
    """Fit classifier `m` on the train split, evaluate on the test split,
    print accuracy / report / confusion matrix / loss / ROC-AUC, and append
    the three metrics to the module-level lists li, li1, li2."""
    m.fit(Xtrain, ytrain)
    testp = m.predict(Xtest)
    # Compute each metric once and reuse it — the original recomputed both
    # accuracy_score and multiclass_roc_auc_score for print and append.
    accuracy = metrics.accuracy_score(ytest, testp)
    print('Accuracy:', accuracy)
    li2.append(accuracy)
    print(classification_report(ytest, testp))
    m2 = metrics.confusion_matrix(ytest, testp)
    print(m2)
    p = m.predict_proba(Xtest)
    loss = cross_entropy(ytest, p)
    auc_score = multiclass_roc_auc_score(ytest, testp)
    print("Loss:", loss)
    print("\nroc_auc_score:", auc_score)
    print("                                         ")
    li.append(loss)
    li1.append(auc_score)
    print("=======================================")
    plt.figure(figsize=(6,6))
    sns.heatmap(m2, annot=True)
    plt.show()
    print(m)
In [31]:
def fit():
    """Evaluate every candidate model via model_built, summarize the three
    collected metric lists in a DataFrame, report the best model per metric,
    clear the lists for a future run, and return the summary frame."""
    candidates = [model, treemodel, knnobj, gnbobj, rfmodel]
    for candidate in candidates:
        print("                                         ")
        model_built(candidate)
        print("==========================================")
    names = ['model', 'treemodel', 'knnobj', 'gnbobj', 'rfmodel']
    dff = pd.DataFrame(
        {"Cross Entropy Loss": li, "Roc_Auc": li1, "Accuracy": li2},
        index=names,
    )
    print("                                                ")
    print("The Entropy Loss of the best model is :", dff["Cross Entropy Loss"].min(), dff[["Cross Entropy Loss"]].idxmin())
    print("The max auc score:", dff["Roc_Auc"].max(), dff[["Roc_Auc"]].idxmax())
    print("The max accuracy :", dff["Accuracy"].max(), dff[["Accuracy"]].idxmax())
    li.clear()
    li1.clear()
    li2.clear()
    return dff

fit()
                                         
Accuracy: 0.8625
              precision    recall  f1-score   support

           0       0.91      1.00      0.96       202
           1       0.81      0.75      0.78       194
           2       0.82      0.70      0.76       202
           3       0.88      1.00      0.94       202

   micro avg       0.86      0.86      0.86       800
   macro avg       0.86      0.86      0.86       800
weighted avg       0.86      0.86      0.86       800

[[202   0   0   0]
 [ 19 145  30   0]
 [  0  33 141  28]
 [  0   0   0 202]]
Loss: 0.6044832181396965

roc_auc_score: 0.9077780618802493
                                         
=======================================
C:\Users\Apratim\Anaconda3\desktop\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
C:\Users\Apratim\Anaconda3\desktop\lib\site-packages\sklearn\linear_model\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.
  "this warning.", FutureWarning)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
==========================================
                                         
Accuracy: 0.76875
              precision    recall  f1-score   support

           0       0.90      0.79      0.84       202
           1       0.66      0.80      0.73       194
           2       0.71      0.63      0.67       202
           3       0.82      0.85      0.83       202

   micro avg       0.77      0.77      0.77       800
   macro avg       0.78      0.77      0.77       800
weighted avg       0.78      0.77      0.77       800

[[160  42   0   0]
 [ 17 156  21   0]
 [  0  37 128  37]
 [  0   0  31 171]]
Loss: 0.6305865529357235

roc_auc_score: 0.8460975472094671
                                         
=======================================
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
==========================================
                                         
Accuracy: 0.88375
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       202
           1       0.82      0.90      0.86       194
           2       0.84      0.80      0.82       202
           3       0.92      0.90      0.91       202

   micro avg       0.88      0.88      0.88       800
   macro avg       0.88      0.88      0.88       800
weighted avg       0.89      0.88      0.88       800

[[189  13   0   0]
 [  9 174  11   0]
 [  0  25 162  15]
 [  0   0  20 182]]
Loss: 0.3405525357380037

roc_auc_score: 0.9226051953883455
                                         
=======================================
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=15, p=2,
           weights='uniform')
==========================================
                                         
Accuracy: 0.805
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       202
           1       0.75      0.75      0.75       194
           2       0.69      0.70      0.70       202
           3       0.85      0.86      0.85       202

   micro avg       0.81      0.81      0.81       800
   macro avg       0.81      0.80      0.80       800
weighted avg       0.81      0.81      0.81       800

[[183  19   0   0]
 [ 14 146  34   0]
 [  0  29 142  31]
 [  0   0  29 173]]
Loss: 0.4476171337910448

roc_auc_score: 0.8697642408450064
                                         
=======================================
GaussianNB(priors=None, var_smoothing=1e-09)
==========================================
                                         
Accuracy: 0.91125
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       202
           1       0.86      0.93      0.90       194
           2       0.90      0.84      0.87       202
           3       0.93      0.93      0.93       202

   micro avg       0.91      0.91      0.91       800
   macro avg       0.91      0.91      0.91       800
weighted avg       0.91      0.91      0.91       800

[[190  12   0   0]
 [  9 181   4   0]
 [  0  17 170  15]
 [  0   0  14 188]]
Loss: 0.24587803330632652

roc_auc_score: 0.94097150983243
                                         
=======================================
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
==========================================
                                                
The Entropy Loss of the best model is : 0.24587803330632652 Cross Entropy Loss    rfmodel
dtype: object
The max auc score: 0.94097150983243 Roc_Auc    rfmodel
dtype: object
The max accuracy : 0.91125 Accuracy    rfmodel
dtype: object
Out[31]:
Cross Entropy Loss Roc_Auc Accuracy
model 0.604483 0.907778 0.86250
treemodel 0.630587 0.846098 0.76875
knnobj 0.340553 0.922605 0.88375
gnbobj 0.447617 0.869764 0.80500
rfmodel 0.245878 0.940972 0.91125
In [ ]: